Get the data¶

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df=pd.read_table('fruit_data_with_colors.txt')
In [3]:
df.head()
Out[3]:
fruit_label fruit_name fruit_subtype mass width height color_score
0 1 apple granny_smith 192 8.4 7.3 0.55
1 1 apple granny_smith 180 8.0 6.8 0.59
2 1 apple granny_smith 176 7.4 7.2 0.60
3 2 mandarin mandarin 86 6.2 4.7 0.80
4 2 mandarin mandarin 84 6.0 4.6 0.79
In [4]:
df.fruit_name.unique()
Out[4]:
array(['apple', 'mandarin', 'orange', 'lemon'], dtype=object)
In [5]:
df.fruit_subtype.unique()
Out[5]:
array(['granny_smith', 'mandarin', 'braeburn', 'golden_delicious',
       'cripps_pink', 'spanish_jumbo', 'selected_seconds', 'turkey_navel',
       'spanish_belsan', 'unknown'], dtype=object)
In [6]:
df.shape
Out[6]:
(59, 7)
In [7]:
df.describe()
Out[7]:
fruit_label mass width height color_score
count 59.000000 59.000000 59.000000 59.000000 59.000000
mean 2.542373 163.118644 7.105085 7.693220 0.762881
std 1.208048 55.018832 0.816938 1.361017 0.076857
min 1.000000 76.000000 5.800000 4.000000 0.550000
25% 1.000000 140.000000 6.600000 7.200000 0.720000
50% 3.000000 158.000000 7.200000 7.600000 0.750000
75% 4.000000 177.000000 7.500000 8.200000 0.810000
max 4.000000 362.000000 9.600000 10.500000 0.930000
In [8]:
df.describe().max()
Out[8]:
fruit_label     59.0
mass           362.0
width           59.0
height          59.0
color_score     59.0
dtype: float64
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fruit_label    59 non-null     int64  
 1   fruit_name     59 non-null     object 
 2   fruit_subtype  59 non-null     object 
 3   mass           59 non-null     int64  
 4   width          59 non-null     float64
 5   height         59 non-null     float64
 6   color_score    59 non-null     float64
dtypes: float64(3), int64(2), object(2)
memory usage: 3.4+ KB

Prepare the Data¶

In [10]:
q_low = df["width"].quantile(0.25)
q_hi  = df["width"].quantile(0.75)

df_filtered = df[(df["width"] < q_hi) & (df["width"] > q_low)]
In [11]:
plt.scatter(df_filtered['mass'],df_filtered['width'])
Out[11]:
<matplotlib.collections.PathCollection at 0x7fd13d0df310>
In [12]:
plt.plot(df['height'],label='height')
plt.plot(df['mass'],label='mass')
plt.plot(df['width'],label='width')
plt.plot(df['color_score'],label='color_score')
plt.legend()
Out[12]:
<matplotlib.legend.Legend at 0x7fd13d204f40>
In [13]:
plt.plot(df['width'],label='width')
plt.plot(df['color_score'],label='color_score')
plt.plot(df['height'],label='height')
plt.legend()
Out[13]:
<matplotlib.legend.Legend at 0x7fd13d2a2fd0>
In [14]:
#lets play with  plotly
import plotly.express as px

fig = px.scatter_3d(df, x='mass', y='width', z='height',
              color='color_score',symbol='fruit_name',opacity=0.7)
fig.show()
In [15]:
lookup_fruits_name = dict(zip(df.fruit_label.unique(),df.fruit_name.unique()))
lookup_fruits_name
Out[15]:
{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}
In [16]:
df.isna().sum()
Out[16]:
fruit_label      0
fruit_name       0
fruit_subtype    0
mass             0
width            0
height           0
color_score      0
dtype: int64
In [17]:
df.duplicated().sum()
Out[17]:
0
In [18]:
#df.drop_duplicates()
#df.dropna()
In [19]:
## Position of the Outlier
#print(np.where(df['Mass']>10))
In [20]:
# Analysing the co-relation between different features 
df.corr()
Out[20]:
fruit_label mass width height color_score
fruit_label 1.000000 0.032738 -0.298090 0.508766 -0.310521
mass 0.032738 1.000000 0.877687 0.609571 -0.079794
width -0.298090 0.877687 1.000000 0.396848 -0.076576
height 0.508766 0.609571 0.396848 1.000000 -0.247047
color_score -0.310521 -0.079794 -0.076576 -0.247047 1.000000
In [21]:
df.tail()
Out[21]:
fruit_label fruit_name fruit_subtype mass width height color_score
54 4 lemon unknown 116 6.1 8.5 0.71
55 4 lemon unknown 116 6.3 7.7 0.72
56 4 lemon unknown 116 5.9 8.1 0.73
57 4 lemon unknown 152 6.5 8.5 0.72
58 4 lemon unknown 118 6.1 8.1 0.70
In [22]:
df["fruit_subtype"]=="unknown"
Out[22]:
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49     True
50     True
51     True
52     True
53     True
54     True
55     True
56     True
57     True
58     True
Name: fruit_subtype, dtype: bool
In [23]:
(df["fruit_subtype"]=="unknown").sum()
Out[23]:
10
In [24]:
# Column which is not contributing we have to drop it 
df.drop('fruit_subtype',axis=1,inplace=True)
In [25]:
df.head()
Out[25]:
fruit_label fruit_name mass width height color_score
0 1 apple 192 8.4 7.3 0.55
1 1 apple 180 8.0 6.8 0.59
2 1 apple 176 7.4 7.2 0.60
3 2 mandarin 86 6.2 4.7 0.80
4 2 mandarin 84 6.0 4.6 0.79

Split the data¶

In [26]:
X = df[['mass', 'width', 'height', 'color_score']]
y = df['fruit_label']
In [27]:
df.shape
Out[27]:
(59, 6)
In [28]:
#Select the ratio
ratio = 0.75
total_rows = df.shape[0]
train_size = int(total_rows*ratio)
In [29]:
# Split data into test and train
X_train = X[0:train_size]
X_test = X[train_size:]
In [41]:
X_train.shape,X_test.shape
Out[41]:
((44, 4), (15, 4))
In [42]:
# Split data into test and train
y_train = y[0:train_size]
y_test = y[train_size:]
In [43]:
from sklearn.model_selection import train_test_split
In [54]:
X = df[['mass', 'width', 'height', 'color_score']]
y = df['fruit_label']

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.75,random_state=0)
In [45]:
# Doing Standardization for increasing the speed of our model by reducing distance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test =scaler.transform(X_test)

Select the Algorithm¶

In [46]:
# Importing our classification model
# here n_neighbors denotes the no. of k , we have taken it as 5 because it is the most used one.
# Then train the model
from sklearn.neighbors import KNeighborsClassifier
knmodel = KNeighborsClassifier(n_neighbors= 5)
knmodel.fit(X_train,y_train)
Out[46]:
KNeighborsClassifier()
In [47]:
# predicted the values with the help of testing dataset
y_pred = knmodel.predict(X_test)
In [49]:
print(knmodel.predict(X_test))
[3 3 4 3 1 1 3 4 3 1 2 1 3 3 1]
In [50]:
print(y_pred[0])
3
In [51]:
lookup_fruits_name[y_pred[0]]
Out[51]:
'orange'
In [52]:
#optional
# I have used in order to see whether they are overlapping or not 
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(y_pred, y_test)
Out[52]:
<matplotlib.collections.PathCollection at 0x7fd1441b86a0>
In [40]:
# Although we can check our accuracy in our above step but if you want to check it more precisly
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)*100
Out[40]:
93.33333333333333
In [ ]: